{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 52,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import xgboost as xgb\n",
    "from sklearn.cross_validation import train_test_split,StratifiedKFold\n",
    "from sklearn.metrics import *\n",
    "from sklearn.linear_model import LogisticRegression,LogisticRegressionCV\n",
    "from sklearn import svm \n",
    "from sklearn.naive_bayes import BernoulliNB,GaussianNB\n",
    "from sklearn.neighbors import KNeighborsClassifier\n",
    "from sklearn.neural_network import MLPClassifier\n",
    "from sklearn.tree import DecisionTreeClassifier\n",
    "import matplotlib.pyplot as plt\n",
    "import numpy as np"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### GBDT模型"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 56,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<function __main__.model_xgb>"
      ]
     },
     "execution_count": 56,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "def model_xgb(train,test):\n",
    "    #数据准备\n",
    "    dtrain = xgb.DMatrix(train.iloc[:, :-1] , label=train.iloc[:, -1] )\n",
    "    dval = xgb.DMatrix(test.iloc[:, :-1], label=test.iloc[:, -1])\n",
    "\n",
    "    #log\n",
    "    params = {\n",
    "        'booster': 'gbtree',\n",
    "        'objective': 'binary:logistic',\n",
    "        'scale_pos_weight': 1100 / 200, #类别权重\n",
    "        'eval_metric': 'auc',\n",
    "        'gamma': 0.01,\n",
    "        'silent': 1,\n",
    "        'max_depth': 5,\n",
    "        'lambda': 0.1,\n",
    "        'subsample': 0.8,\n",
    "        'colsample_bytree': 0.8,\n",
    "        'min_child_weight': 0.1,\n",
    "        'eta': 0.016,\n",
    "        'nthread': 5\n",
    "    }\n",
    "\n",
    "    evals_result = {}\n",
    "    watchlist = [(dtrain,'train'),(dval,'val')]\n",
    "    xgb_model = xgb.train(params,dtrain,num_boost_round=1000,early_stopping_rounds=200,verbose_eval=False,\n",
    "                          evals=watchlist,evals_result=evals_result)\n",
    "     \n",
    "    result = max(evals_result['val']['auc'])\n",
    "    return xgb_model,result\n",
    "model_xgb"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# model_xgb,auc = model_xgb(train)\n",
    "# y_pro = model_xgb.predict(test_ma)\n",
    "# # df = pro_optimization(y_pro,test.iloc[:,-1])\n",
    "# ks_plot(test.iloc[:,-1],y_pro,'GBDT')\n",
    "# #特征权重\n",
    "# dic = model.get_fscore()\n",
    "# fea_rank = pd.DataFrame.from_dict([dic])\n",
    "# fea_rank.T.sort_values(by=0,ascending= False).to_csv('./data/features_rank_model_nor.csv')\n",
    "# fea_rank.T.sort_values(by=0,ascending= False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": true
   },
   "source": [
    "### LR模型"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "LogisticRegressionCV(Cs=10, class_weight='balanced', cv=None, dual=False,\n",
       "           fit_intercept=True, intercept_scaling=1.0, max_iter=100,\n",
       "           multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,\n",
       "           refit=True, scoring=None, solver='lbfgs', tol=0.0001, verbose=0)"
      ]
     },
     "execution_count": 34,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#class_weight = {1:0.9,0:0.1}\n",
    "def model_lr(train_X,train_Y):\n",
    "    lg = LogisticRegressionCV(class_weight='balanced')\n",
    "    lg.fit(train_X,train_Y)  \n",
    "    return lg\n",
    "lg = LogisticRegressionCV(class_weight='balanced')\n",
    "lg"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# y_pre = lg.predict(val_X)\n",
    "# print 'accuracy:',accuracy_score(val_Y,y_pre)\n",
    "# print 'precision:',precision_score(val_Y,y_pre)\n",
    "# print 'recall:',recall_score(val_Y,y_pre)\n",
    "# print 'f1_score:',f1_score(val_Y,y_pre)\n",
    "\n",
    "# y_pro_lr = lg.predict_proba(val_X)\n",
    "# y_pro_lr = [x[1] for x in y_pro_lr]\n",
    "# ks_plot(val_Y,y_pro_lr,'LR')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### SVM 模型"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "SVC(C=1.0, cache_size=200, class_weight='balanced', coef0=0.0,\n",
       "  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',\n",
       "  max_iter=-1, probability=True, random_state=None, shrinking=True,\n",
       "  tol=0.001, verbose=False)"
      ]
     },
     "execution_count": 36,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "def model_svm(train_X,train_Y):\n",
    "    m_svm = svm.SVC(class_weight='balanced',probability=True)\n",
    "    m_svm.fit(train_X,train_Y)\n",
    "    return m_svm\n",
    "svm.SVC(class_weight='balanced',probability=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# y_pro_svm = model_svm.predict_proba(val_X)\n",
    "# y_pro_svm = [x[1] for x in y_pro_svm]\n",
    "# ks_plot(val_Y,y_pro_svm,'SVM')\n",
    "# svm.SVC(class_weight='balanced',probability=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### NB模型"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "GaussianNB(priors=None)"
      ]
     },
     "execution_count": 38,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "def model_NB(train_X,train_Y):\n",
    "    nb = GaussianNB()\n",
    "    nb.fit(train_X,train_Y)\n",
    "    return nb\n",
    "GaussianNB()\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### KNN模型"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',\n",
       "           metric_params=None, n_jobs=1, n_neighbors=10, p=2,\n",
       "           weights='uniform')"
      ]
     },
     "execution_count": 39,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "def model_KNN(train_X,train_Y):\n",
    "    knn = KNeighborsClassifier(n_neighbors=10)\n",
    "    knn.fit(train_X,train_Y)\n",
    "    return knn\n",
    "KNeighborsClassifier(n_neighbors=10)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Neural Network"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "MLPClassifier(activation='logistic', alpha=0.0001, batch_size='auto',\n",
       "       beta_1=0.9, beta_2=0.999, early_stopping=False, epsilon=1e-08,\n",
       "       hidden_layer_sizes=(100,), learning_rate='constant',\n",
       "       learning_rate_init=0.001, max_iter=200, momentum=0.9,\n",
       "       nesterovs_momentum=True, power_t=0.5, random_state=None,\n",
       "       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,\n",
       "       verbose=False, warm_start=False)"
      ]
     },
     "execution_count": 40,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "def model_NN(train_X,train_Y):\n",
    "    nn = MLPClassifier(activation='logistic')\n",
    "    nn.fit(train_X,train_Y)\n",
    "    return nn\n",
    "MLPClassifier(activation='logistic')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### DT模型"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,\n",
       "            max_features=None, max_leaf_nodes=None,\n",
       "            min_impurity_split=1e-07, min_samples_leaf=1,\n",
       "            min_samples_split=2, min_weight_fraction_leaf=0.0,\n",
       "            presort=False, random_state=None, splitter='best')"
      ]
     },
     "execution_count": 41,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "def model_DT(train_X,train_Y):\n",
    "    dt = DecisionTreeClassifier(class_weight='balanced',max_depth='8')\n",
    "    dt.fit(train_X,train_Y)\n",
    "    return dt\n",
    "DecisionTreeClassifier()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 加载数据\n",
    "- 5折交叉验证"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "#订单样本\n",
    "# samples = pd.read_csv('./new_out/samples_all_fillna_nor.csv')  #所有客户\n",
    "# samples = pd.read_csv('./data/samples_all_fillna.csv',index_col=0)\n",
    "# samples = pd.read_csv('./data/samples_pca.csv',index_col=0)\n",
    "# samples = pd.read_csv('./new_out/old_custom_samples_all_fillna_nor.csv',index_col=0)  #老客户\n",
    "\n",
    "# cols_filter =['实收资本','注册时间','流动比率','资产负债率','存货周转天数','净资产','主营业务利润率','应收账款周转天数',\n",
    "#               '下游客户情况','企业征信','涉诉信息','有无贷款','提前还款占比','逾期占比','平均逾期天数','5-10天占比','4天以内占比',\n",
    "#               '账期订单交易量均值','交易量方差','交易量最小值','交易额最小值','交易量均值','采购频率（订单总数/合作月数）','供应商数量',\n",
    "#               '交易额均值','账期订单交易额均值','交易额方差','交易稳定性','交易额最大值','合作时间（月）',\n",
    "#               '账期订单占比','交易额总量','交易总量','交易量最大值','11天以上占比','最近连续交易月数','账期合作时间（月）',\n",
    "#               '采购聚集程度（订单总数/供应商数）','订单总数','label']\n",
    "# samples = samples[cols_filter]\n",
    "# samples.shape,samples.columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "#不合并企业征信\n",
    "# samples = pd.read_csv('./data/samples_all_fillna_nor.csv',index_col=0)\n",
    "# cols_filter =['实收资本','注册时间','流动比率','资产负债率','存货周转天数','净资产','主营业务利润率','应收账款周转天数',\n",
    "#               '下游客户情况','征信不良', '征信关注', '征信欠息','征信正常','涉诉信息','有无贷款','提前还款占比','逾期占比',\n",
    "#               '5-10天占比','4天以内占比',\n",
    "#               '账期订单交易量均值','交易量方差','交易量最小值','交易额最小值','交易量均值','采购频率（订单总数/合作月数）','供应商数量',\n",
    "#               '交易额均值','账期订单交易额均值','交易额方差','交易稳定性（近三个月交易量变化均值）','交易额最大值','合作时间（月）',\n",
    "#               '账期订单占比','交易额总量','交易总量','交易量最大值','11天以上占比','最近连续交易月数','账期合作时间（月）',\n",
    "#               '采购聚集程度（订单总数/供应商数）','订单总数','label']\n",
    "# samples = samples[cols_filter]\n",
    "# samples.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "#分账期\n",
    "# samples =pd.read_csv('./data/samples_7_fillna_nor.csv',index_col=0)\n",
    "# samples =pd.read_csv('./data/samples_15_fillna_nor.csv')\n",
    "# samples =pd.read_csv('./data/samples_30_fillna_nor.csv')\n",
    "# cols_filter =['实收资本','注册时间','流动比率','资产负债率','存货周转天数','净资产','主营业务利润率','应收账款周转天数',\n",
    "#               '下游客户情况','征信不良', '征信关注', '征信欠息', '征信正常','涉诉信息','有无贷款','提前还款占比','逾期占比',\n",
    "#               '5-10天占比','账期订单交易量均值',\n",
    "#               '交易量方差','交易量最小值','4天以内占比','交易额最小值','交易量均值','采购频率（订单总数/合作月数）','供应商数量',\n",
    "#               '交易额均值','账期订单交易额均值','交易额方差','交易稳定性（近三个月交易量变化均值）','交易额最大值','合作时间（月）',\n",
    "#               '账期订单占比','交易额总量','交易总量','交易量最大值','11天以上占比','最近连续交易月数','账期合作时间（月）',\n",
    "#               '采购聚集程度（订单总数/供应商数）','订单总数','label']\n",
    "# samples = samples[cols_filter]\n",
    "# samples.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#变量筛选分段测试\n",
    "# samples =pd.read_csv('./data/samples_sub_fillna_nor.csv',index_col=0)\n",
    "# samples.head(),samples.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "#以客户为样本\n",
    "# samples =pd.read_csv('./data/samples_kehu_all_fillna_nor.csv',index_col=0)\n",
    "# cols_filter =['实收资本','注册时间','流动比率','资产负债率','存货周转天数','净资产','主营业务利润率','应收账款周转天数',\n",
    "#               '下游客户情况','企业征信','涉诉信息','有无贷款','账期订单交易量均值','交易量方差','交易量最小值','交易额最小值',\n",
    "#               '交易量均值','采购频率（订单总数/合作月数）','供应商数量',\n",
    "#               '交易额均值','账期订单交易额均值','交易额方差','交易稳定性（近三个月交易量变化均值）','交易额最大值','合作时间（月）',\n",
    "#               '账期订单占比','交易额总量','交易总量','交易量最大值','最近连续交易月数','账期合作时间（月）',\n",
    "#               '采购聚集程度（订单总数/供应商数）','订单总数','label']\n",
    "# samples = samples[cols_filter]\n",
    "# samples.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#求ks值\n",
    "def ks_value(y_true,y_pre):\n",
    "    fpr,tpr,t = roc_curve(y_true,y_pre)\n",
    "    gap =tpr-fpr\n",
    "    ks_value = np.max(gap)\n",
    "    auc_value = auc(fpr,tpr)\n",
    "    return ks_value,auc_value\n",
    "    \n",
    "def ks_plot(y_true,y_pre,model_name=None):\n",
    "    plt.figure()\n",
    "    fpr,tpr,t = roc_curve(y_true,y_pre)\n",
    "    plt.plot(fpr,label='False Positive Rate')\n",
    "    plt.plot(tpr,label='Ture Positive Rate')\n",
    "\n",
    "    gap = tpr-fpr\n",
    "    ks_value = np.max(gap)\n",
    "    ks_index = np.where(gap == ks_value)\n",
    "    y = [fpr[ks_index], tpr[ks_index]]\n",
    "    x = [int(ks_index[0]), int(ks_index[0])]\n",
    "    \n",
    "    plt.plot(x,y,label='KS_max=%.2f'%ks_value)\n",
    "\n",
    "    plt.title(\"%s KS graph\\n th=%0.3f\" % (model_name,t[ks_index]))\n",
    "    plt.legend(loc=\"mid right\")\n",
    "    if model_name:\n",
    "        plt.savefig('./new_figure/%s_%.2f.png'%(model_name,ks_value))\n",
    "    plt.show()\n",
    "    \n",
    "def auc_plot(y_true,y_pre,model_name=None):\n",
    "    plt.figure()\n",
    "    fpr,tpr,th = roc_curve(y_true, y_pre)\n",
    "    auc_value = auc(fpr,tpr)\n",
    "    plt.plot(fpr,tpr,lw=1,label=\"ROC fold (AUC=%.2f)\" % (auc_value))\n",
    "    plt.xlim([-0.05,1.05])\n",
    "    plt.ylim([0, 1.05])\n",
    "    plt.xlabel(\"False Positive Rate\")\n",
    "    plt.ylabel(\"True Positive Rate\")\n",
    "    plt.title(\"%s ROC curve\"%model_name)\n",
    "    plt.legend(loc='lower right')\n",
    "    if model_name:\n",
    "        plt.savefig('./new_figure/%s_%.2f.png'%(model_name,auc_value))\n",
    "    plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#GBDT\n",
    "def GBDT_CV():\n",
    "    ks_list = []\n",
    "    for train,test in skf:\n",
    "        train_data = samples.iloc[train]\n",
    "        test_data = samples.iloc[test]\n",
    "        test_ma=xgb.DMatrix(test_data.iloc[:,:-1],test_data.iloc[:,-1])\n",
    "        gbdt,auc = model_xgb(train_data,test_data)\n",
    "        y_pro = gbdt.predict(test_ma)\n",
    "        ks = ks_value(test_data.iloc[:,-1],y_pro)\n",
    "        ks_list.append(ks)\n",
    "    print ks_list\n",
    "    return np.mean([x[0] for x in ks_list])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def model_CV(model_name):\n",
    "    ks_list = []\n",
    "#     weight = []\n",
    "#     bias =[]\n",
    "    for train,test in skf:\n",
    "        train_data = samples.iloc[train]\n",
    "        test_data = samples.iloc[test]\n",
    "        model = model_name(train_data.iloc[:,:-1],train_data.iloc[:,-1])\n",
    "        y_pro = model.predict_proba(test_data.iloc[:,:-1])\n",
    "        y_pro = [x[1] for x in y_pro]\n",
    "        ks = ks_value(test_data.iloc[:,-1],y_pro)        \n",
    "        ks_list.append(ks)\n",
    "#         weight.append(list(model.coef_[0]))\n",
    "#         bias.append(list(model.intercept_))\n",
    "    print ks_list\n",
    "    return np.mean([x[0] for x in ks_list])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(1391, 46)"
      ]
     },
     "execution_count": 54,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#季度统计订单样本\n",
    "# samples = pd.read_csv('./new_out/3m_samples_all_fillna_nor.csv')\n",
    "# samples =pd.read_csv('./new_out/3m_samples_all_add_fillna_nor.csv')\n",
    "# samples =pd.read_csv('./new_out/3m_samples_woe_fillna_nor.csv') #所有变量，离散变量先缺失值填充0，再woe编码\n",
    "# samples =pd.read_csv('./new_out/3m_samples_woe.csv') #所有变量woe编码\n",
    "# samples = pd.read_csv('./new_out/old_custom_samples_3m_fillna_nor.csv')#老客户\n",
    "# samples = pd.read_csv('./out1/samples_41_fillna_mean.csv',index_col=0) #离散变量woe编码，连续变量归一化，所有变量缺失值填均值\n",
    "samples = pd.read_csv('./out1/samples_for_card_fillna_mean.csv',index_col=0)\n",
    "skf = StratifiedKFold(samples.iloc[:,-1],n_folds=5)\n",
    "samples.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "sklearn.cross_validation.StratifiedKFold(labels=[1 0 0 ..., 0 0 0], n_folds=5, shuffle=False, random_state=None)"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "skf"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 128,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[(0.35531775496833373, 0.61356191308145891), (0.60098687899517778, 0.87226645732869801), (0.29595155321296401, 0.65633060446338454), (0.29213861164068633, 0.61775260737916349), (0.48715935852865316, 0.79735336996747785)]\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "0.40631083146916297"
      ]
     },
     "execution_count": 128,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "lr_ks = model_CV(model_lr)\n",
    "lr_ks"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 126,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[(0.39266215330858267, 0.63452718934265118), (0.59952898957048339, 0.86419199282269821), (0.35022989794774034, 0.64657395985196808), (0.3689581697880453, 0.66081641807782887), (0.474486935067848, 0.77447572053381186)]\n",
      "0.437173229137\n"
     ]
    }
   ],
   "source": [
    "nn =model_CV(model_NN)\n",
    "print nn"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 134,
   "metadata": {
    "collapsed": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[]\n"
     ]
    },
    {
     "ename": "ValueError",
     "evalue": "Cannot set a frame with no defined index and a value that cannot be converted to a Series",
     "output_type": "error",
     "traceback": [
      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[1;31mValueError\u001b[0m                                Traceback (most recent call last)",
      "\u001b[1;32m<ipython-input-134-07480eb4cad8>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m()\u001b[0m\n\u001b[0;32m      1\u001b[0m \u001b[1;32mprint\u001b[0m \u001b[0mnp\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mmean\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mnp\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mmat\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mweight\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0maxis\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;36m0\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m      2\u001b[0m \u001b[0mtmp\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mpd\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mDataFrame\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mweight\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mT\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 3\u001b[1;33m \u001b[0mtmp\u001b[0m \u001b[1;33m[\u001b[0m\u001b[1;34m'weight_avg'\u001b[0m\u001b[1;33m]\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mnp\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mmean\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mnp\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mmat\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mweight\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0maxis\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;36m0\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mT\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m      4\u001b[0m \u001b[0mtmp\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m'col'\u001b[0m\u001b[1;33m]\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mlist\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0msamples\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0miloc\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m-\u001b[0m\u001b[1;36m1\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcolumns\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m      5\u001b[0m \u001b[0mtmp\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32m/home/sf/anaconda2/lib/python2.7/site-packages/pandas/core/frame.pyc\u001b[0m in \u001b[0;36m__setitem__\u001b[1;34m(self, key, value)\u001b[0m\n\u001b[0;32m   2427\u001b[0m         \u001b[0mAssign\u001b[0m \u001b[0mnew\u001b[0m \u001b[0mcolumns\u001b[0m \u001b[0mto\u001b[0m \u001b[0ma\u001b[0m \u001b[0mDataFrame\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mreturning\u001b[0m \u001b[0ma\u001b[0m \u001b[0mnew\u001b[0m \u001b[0mobject\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m   2428\u001b[0m         \u001b[1;33m(\u001b[0m\u001b[0ma\u001b[0m \u001b[0mcopy\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;32mwith\u001b[0m \u001b[0mall\u001b[0m \u001b[0mthe\u001b[0m \u001b[0moriginal\u001b[0m \u001b[0mcolumns\u001b[0m \u001b[1;32min\u001b[0m \u001b[0maddition\u001b[0m \u001b[0mto\u001b[0m \u001b[0mthe\u001b[0m \u001b[0mnew\u001b[0m \u001b[0mones\u001b[0m\u001b[1;33m.\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 2429\u001b[1;33m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m   2430\u001b[0m         \u001b[1;33m.\u001b[0m\u001b[1;33m.\u001b[0m \u001b[0mversionadded\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m:\u001b[0m \u001b[1;36m0.16\u001b[0m\u001b[1;36m.0\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m   2431\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32m/home/sf/anaconda2/lib/python2.7/site-packages/pandas/core/frame.pyc\u001b[0m in \u001b[0;36m_set_item\u001b[1;34m(self, key, value)\u001b[0m\n\u001b[0;32m   2492\u001b[0m         \u001b[0mdata\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcopy\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m   2493\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 2494\u001b[1;33m         \u001b[1;31m# do all calculations first...\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m   2495\u001b[0m         \u001b[0mresults\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;33m{\u001b[0m\u001b[1;33m}\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m   2496\u001b[0m         \u001b[1;32mfor\u001b[0m \u001b[0mk\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mv\u001b[0m \u001b[1;32min\u001b[0m \u001b[0mkwargs\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mitems\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32m/home/sf/anaconda2/lib/python2.7/site-packages/pandas/core/frame.pyc\u001b[0m in \u001b[0;36m_ensure_valid_index\u001b[1;34m(self, value)\u001b[0m\n\u001b[0;32m   2474\u001b[0m         \u001b[1;36m9\u001b[0m  \u001b[1;36m10\u001b[0m \u001b[1;33m-\u001b[0m\u001b[1;36m0.758542\u001b[0m  \u001b[1;36m2.302585\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m   2475\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 2476\u001b[1;33m         \u001b[0mWhere\u001b[0m \u001b[0mthe\u001b[0m \u001b[0mvalue\u001b[0m \u001b[0malready\u001b[0m \u001b[0mexists\u001b[0m \u001b[1;32mand\u001b[0m \u001b[1;32mis\u001b[0m \u001b[0minserted\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m   2477\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m   2478\u001b[0m         \u001b[1;33m>>\u001b[0m\u001b[1;33m>\u001b[0m \u001b[0mnewcol\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mnp\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mlog\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdf\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m'A'\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;31mValueError\u001b[0m: Cannot set a frame with no defined index and a value that cannot be converted to a Series"
     ]
    }
   ],
   "source": [
    "print np.mean(np.mat(weight), axis=0)\n",
    "tmp = pd.DataFrame(weight).T\n",
    "tmp ['weight_avg'] = np.mean(np.mat(weight), axis=0).T\n",
    "tmp['col'] = list(samples.iloc[:,:-1].columns)\n",
    "tmp"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[(0.39801266652107453, 0.72532212273422147), (0.4872715038690143, 0.76180329707300665), (0.3605472692609622, 0.68649770102052265), (0.47493551642929244, 0.75714926544802064), (0.43927329819445998, 0.75485028597061798)]\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "0.43200805085496069"
      ]
     },
     "execution_count": 51,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "svm_ks =model_CV(model_svm)\n",
    "svm_ks"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "nb_ks = model_CV(model_NB)\n",
    "nb_ks"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "knn = model_CV(model_KNN)\n",
    "knn"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "dt =model_CV(model_DT)\n",
    "dt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[(0.42683992138021398, 0.71331076654291325), (0.2715038690142425, 0.6613210721094539), (0.32533363238757435, 0.66485365033082866), (0.4956824043960974, 0.77800829875518673), (0.3396882359537961, 0.69373107547381407)]\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "0.3718096126263849"
      ]
     },
     "execution_count": 55,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "gbdt_ks =GBDT_CV()\n",
    "gbdt_ks"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# from sklearn import cross_validation\n",
    "# cv = StratifiedKFold(samples.iloc[:,-1],n_folds=5)\n",
    "# lg = LogisticRegressionCV(class_weight='balanced')\n",
    "# cv_lr = cross_validation.cross_val_score(lg,samples.iloc[:,:-1],samples.iloc[:,-1],cv=cv)\n",
    "# cv_lr"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 最佳训练数据集"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "count=0\n",
    "for train,test in skf:\n",
    "    count +=1\n",
    "    train_data = samples.iloc[train]\n",
    "    test_data = samples.iloc[test]\n",
    "    train_data.to_csv('./data/model_train_data_%d.csv'%count)\n",
    "    test_data.to_csv('./data/model_test_data_%d.csv'%count)\n",
    "train_data = pd.read_csv('./data/model_train_data_5.csv',index_col=0)\n",
    "test_data = pd.read_csv('./data/model_test_data_5.csv',index_col=0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "lr = model_lr(train_data.iloc[:,:-1],train_data.iloc[:,-1])\n",
    "y_pro = lr.predict_proba(test_data.iloc[:,:-1])\n",
    "y_pro = [x[1] for x in y_pro]\n",
    "ks_plot(test_data.iloc[:,-1],y_pro,'LR_ks_3m_sub_woe')\n",
    "auc_plot(test_data.iloc[:,-1],y_pro,'LR_auc_3m_sub_woe')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "m_svm = model_svm(train_data.iloc[:,:-1],train_data.iloc[:,-1])\n",
    "y_pro = m_svm.predict_proba(test_data.iloc[:,:-1])\n",
    "y_pro = [x[1] for x in y_pro]\n",
    "ks_plot(test_data.iloc[:,-1],y_pro,'svm_ks_3m_sub_woe')\n",
    "auc_plot(test_data.iloc[:,-1],y_pro,'svm_auc_3m_sub_woe')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "nn = MLPClassifier(activation='logistic')\n",
    "nn.fit(train_data.iloc[:,:-1],train_data.iloc[:,-1])\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "nn.n_layers_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
